{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# LangWatch DSPy Visualizer\n",
        "\n",
        "This notebook shows an example of a simple DSPy optimization process integrated with LangWatch for training visualization and debugging.\n",
        "\n",
        "[<img align=\"center\" src=\"https://colab.research.google.com/assets/colab-badge.svg\" />](https://colab.research.google.com/github/langwatch/langwatch/blob/main/python-sdk/examples/dspy_visualization.ipynb)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Pgy1Fjhh_lOB"
      },
      "outputs": [],
      "source": [
        "# Install langwatch along with dspy for the visualization\n",
        "!pip install dspy langwatch"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "51OWavv1CCVV"
      },
      "source": [
        "## Preparing the LLM"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xycw8IWs_qnt",
        "outputId": "40844780-608a-4162-cac7-35f57c5764f1"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "LLM test response: ['Hello! How can I assist you today?']\n"
          ]
        }
      ],
      "source": [
        "import os\n",
        "from getpass import getpass\n",
        "\n",
        "os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OPENAI_API_KEY: \")\n",
        "\n",
        "import dspy\n",
        "\n",
        "llm = dspy.LM(\"openai/gpt-4o-mini\", api_key=os.environ[\"OPENAI_API_KEY\"])\n",
        "\n",
        "print(\"LLM test response:\", llm(\"hello there\"))\n",
        "\n",
        "colbertv2_wiki17_abstracts = dspy.ColBERTv2(\n",
        "    url=\"http://20.102.90.50:2017/wiki17_abstracts\"\n",
        ")\n",
        "dspy.settings.configure(lm=llm, rm=colbertv2_wiki17_abstracts)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YIAYLNlcCFdO"
      },
      "source": [
        "## Preparing the Dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "NXH8qF-QBcEJ",
        "outputId": "203d4516-b9a2-4748-8fcc-7dd64be1342d"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "(32, 50)"
            ]
          },
          "execution_count": 7,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "from dspy.datasets import HotPotQA\n",
        "\n",
        "# Load the dataset.\n",
        "dataset = HotPotQA(train_seed=1, train_size=32, eval_seed=2025, dev_size=50, test_size=0)\n",
        "\n",
        "# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.\n",
        "trainset = [x.with_inputs('question') for x in dataset.train]\n",
        "devset = [x.with_inputs('question') for x in dataset.dev]\n",
        "\n",
        "len(trainset), len(devset)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KOXtqnmfCNzS"
      },
      "source": [
        "## Defining the model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WxAaf1IABgxM",
        "outputId": "03155b63-bbaf-4ceb-bf26-07cda4f52b6f"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "[Devset] Question: Which magazine was released first, Fortune or Motor Trend?\n",
            "[Devset] Answer: Motor Trend\n",
            "[Devset] Relevant Wikipedia Titles: {'Fortune (magazine)', 'Motor Trend'}\n",
            "[Prediction] Question: Which magazine was released first, Fortune or Motor Trend?\n",
            "[Prediction] Predicted Answer: Fortune\n"
          ]
        }
      ],
      "source": [
        "class GenerateAnswer(dspy.Signature):\n",
        "    \"\"\"Answer questions with short factoid answers.\"\"\"\n",
        "\n",
        "    context = dspy.InputField(desc=\"may contain relevant facts\")\n",
        "    question = dspy.InputField()\n",
        "    answer = dspy.OutputField(desc=\"often between 1 and 5 words\")\n",
        "\n",
        "\n",
        "class RAG(dspy.Module):\n",
        "    def __init__(self, num_passages=3):\n",
        "        super().__init__()\n",
        "\n",
        "        self.retrieve = dspy.Retrieve(k=num_passages)\n",
        "        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)\n",
        "\n",
        "    def forward(self, question):\n",
        "        context = self.retrieve(question).passages # type: ignore\n",
        "        prediction = self.generate_answer(context=context, question=question)\n",
        "        return dspy.Prediction(context=context, answer=prediction.answer)\n",
        "\n",
        "\n",
        "dev_example = devset[18]\n",
        "print(f\"[Devset] Question: {dev_example.question}\")\n",
        "print(f\"[Devset] Answer: {dev_example.answer}\")\n",
        "print(f\"[Devset] Relevant Wikipedia Titles: {dev_example.gold_titles}\")\n",
        "\n",
        "generate_answer = RAG()\n",
        "\n",
        "pred = generate_answer(question=dev_example.question)\n",
        "\n",
        "# Print the input and the prediction.\n",
        "print(f\"[Prediction] Question: {dev_example.question}\")\n",
        "print(f\"[Prediction] Predicted Answer: {pred.answer}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ytbRU9jJCSj8"
      },
      "source": [
        "## Login to LangWatch"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "lF9DxTGeCU15",
        "outputId": "29ebbacf-9ca8-4322-fd77-32e47f4c93aa"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "LangWatch API key is already set, if you want to login again, please call as langwatch.login(relogin=True)\n"
          ]
        }
      ],
      "source": [
        "import langwatch\n",
        "\n",
        "langwatch.login()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "o69S-BlkE-bV"
      },
      "source": [
        "## Start Training Session!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Ef67q2B-FCIP",
        "outputId": "cefe1935-1ef3-46ad-a54c-74f08bfb75f0"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "[LangWatch] Experiment initialized, run_id: peculiar-marmot-of-renovation\n",
            "[LangWatch] Open http://localhost:5560/inbox-narrator/experiments/my-awesome-experiment?runIds=peculiar-marmot-of-renovation to track your DSPy training session live\n",
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:14 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
            "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n",
            "2025/03/26 22:15:14 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n",
            "\n",
            "2025/03/26 22:15:14 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=2 sets of demonstrations...\n",
            "2025/03/26 22:15:14 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
            "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n",
            "2025/03/26 22:15:14 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Bootstrapping set 1/2\n",
            "Bootstrapping set 2/2\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:17 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
            "Proposing instructions...\n",
            "\n",
            "2025/03/26 22:15:29 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n",
            "\n",
            "2025/03/26 22:15:29 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Answer questions with short factoid answers.\n",
            "\n",
            "2025/03/26 22:15:29 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Given the context and the question, generate a reasoned answer that is concise and factually correct, ensuring it consists of no more than 5 words.\n",
            "\n",
            "2025/03/26 22:15:29 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
            "\n",
            "2025/03/26 22:15:29 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n",
            "2025/03/26 22:15:29 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n",
            "\n",
            "2025/03/26 22:15:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 1 / 12 - Full Evaluation of Default Program ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "  0%|          | 0/25 [00:00<?, ?it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:29 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Which American actress who made their film debut in the 1995 teen drama \"Kids\" was the co-founder of Voto Latino?', 'answer': 'Rosario Dawson'}) (input_keys={'question'}): 'topk'. Set `provide_traceback=True` for traceback.\n",
            "2025/03/26 22:15:29 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Samantha Cristoforetti and Mark Shuttleworth are both best known for being first in their field to go where? ', 'answer': 'space'}) (input_keys={'question'}): 'topk'. Set `provide_traceback=True` for traceback.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 3.00 / 4 (75.0%):  24%|██▍       | 6/25 [00:01<00:03,  4.94it/s] "
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:30 ERROR dspy.utils.parallelizer: Error for Example({'question': 'The Organisation that allows a community to influence their operation or use and to enjoy the benefits arisingwas founded in what year?', 'answer': '2010'}) (input_keys={'question'}): 'topk'. Set `provide_traceback=True` for traceback.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 3.00 / 4 (75.0%):  24%|██▍       | 6/25 [00:01<00:03,  4.94it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:30 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Who is older, Aleksandr Danilovich Aleksandrov or Anatoly Fomenko?', 'answer': 'Aleksandr Danilovich Aleksandrov'}) (input_keys={'question'}): 'topk'. Set `provide_traceback=True` for traceback.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 4.00 / 12 (33.3%):  60%|██████    | 15/25 [00:02<00:01,  6.47it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:32 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Which band had a longer hiatus, Juliette and the Licks or The Last Shadow Puppets?', 'answer': 'The Last Shadow Puppets'}) (input_keys={'question'}): 'topk'. Set `provide_traceback=True` for traceback.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 4.00 / 12 (33.3%):  68%|██████▊   | 17/25 [00:02<00:01,  7.50it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:32 ERROR dspy.utils.parallelizer: Error for Example({'question': 'Who composed \"Sunflower Slow Drag\" with the King of Ragtime?', 'answer': 'Scott Hayden'}) (input_keys={'question'}): 'topk'. Set `provide_traceback=True` for traceback.\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 6.00 / 19 (31.6%): 100%|██████████| 25/25 [00:04<00:00,  5.20it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:34 INFO dspy.evaluate.evaluate: Average Metric: 6.0 / 25 (24.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:34 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 24.0\n",
            "\n",
            "/Users/rchaves/Projects/langwatch-saas/langwatch/python-sdk/notebook_env/lib/python3.12/site-packages/optuna/_experimental.py:31: ExperimentalWarning: Argument ``multivariate`` is an experimental feature. The interface can change in the future.\n",
            "  warnings.warn(\n",
            "2025/03/26 22:15:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 12 - Minibatch ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 8.00 / 25 (32.0%): 100%|██████████| 25/25 [00:05<00:00,  4.89it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:39 INFO dspy.evaluate.evaluate: Average Metric: 8 / 25 (32.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 32.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0'].\n",
            "2025/03/26 22:15:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: []\n",
            "2025/03/26 22:15:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0]\n",
            "2025/03/26 22:15:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 24.0\n",
            "2025/03/26 22:15:40 INFO dspy.teleprompt.mipro_optimizer_v2: =========================================\n",
            "\n",
            "\n",
            "2025/03/26 22:15:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 12 - Minibatch ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 7.00 / 25 (28.0%): 100%|██████████| 25/25 [00:05<00:00,  4.87it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:45 INFO dspy.evaluate.evaluate: Average Metric: 7 / 25 (28.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0'].\n",
            "2025/03/26 22:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: []\n",
            "2025/03/26 22:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0, 28.0]\n",
            "2025/03/26 22:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 24.0\n",
            "2025/03/26 22:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: =========================================\n",
            "\n",
            "\n",
            "2025/03/26 22:15:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 12 - Minibatch ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:06<00:00,  4.16it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:51 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].\n",
            "2025/03/26 22:15:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: []\n",
            "2025/03/26 22:15:52 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0, 28.0, 40.0]\n",
            "2025/03/26 22:15:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 24.0\n",
            "2025/03/26 22:15:52 INFO dspy.teleprompt.mipro_optimizer_v2: =========================================\n",
            "\n",
            "\n",
            "2025/03/26 22:15:52 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 12 - Minibatch ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 9.00 / 25 (36.0%): 100%|██████████| 25/25 [00:06<00:00,  4.01it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:58 INFO dspy.evaluate.evaluate: Average Metric: 9 / 25 (36.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:15:58 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 36.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 0'].\n",
            "2025/03/26 22:15:58 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: []\n",
            "2025/03/26 22:15:58 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0, 28.0, 40.0, 36.0]\n",
            "2025/03/26 22:15:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 24.0\n",
            "2025/03/26 22:15:58 INFO dspy.teleprompt.mipro_optimizer_v2: =========================================\n",
            "\n",
            "\n",
            "2025/03/26 22:15:58 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 12 - Minibatch ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 9.00 / 25 (36.0%): 100%|██████████| 25/25 [00:07<00:00,  3.39it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:06 INFO dspy.evaluate.evaluate: Average Metric: 9 / 25 (36.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 36.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].\n",
            "2025/03/26 22:16:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: []\n",
            "2025/03/26 22:16:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0, 28.0, 40.0, 36.0, 36.0]\n",
            "2025/03/26 22:16:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 24.0\n",
            "2025/03/26 22:16:06 INFO dspy.teleprompt.mipro_optimizer_v2: =========================================\n",
            "\n",
            "\n",
            "2025/03/26 22:16:06 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 7 / 12 - Minibatch ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 8.00 / 25 (32.0%): 100%|██████████| 25/25 [00:08<00:00,  3.00it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:14 INFO dspy.evaluate.evaluate: Average Metric: 8 / 25 (32.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 32.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0'].\n",
            "2025/03/26 22:16:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: []\n",
            "2025/03/26 22:16:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0, 28.0, 40.0, 36.0, 36.0, 32.0]\n",
            "2025/03/26 22:16:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 24.0\n",
            "2025/03/26 22:16:15 INFO dspy.teleprompt.mipro_optimizer_v2: =========================================\n",
            "\n",
            "\n",
            "2025/03/26 22:16:15 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 12 - Minibatch ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:10<00:00,  2.36it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:25 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 0'].\n",
            "2025/03/26 22:16:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: []\n",
            "2025/03/26 22:16:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0, 28.0, 40.0, 36.0, 36.0, 32.0, 40.0]\n",
            "2025/03/26 22:16:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 24.0\n",
            "2025/03/26 22:16:26 INFO dspy.teleprompt.mipro_optimizer_v2: =========================================\n",
            "\n",
            "\n",
            "2025/03/26 22:16:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 12 - Minibatch ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:06<00:00,  4.07it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:32 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].\n",
            "2025/03/26 22:16:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: []\n",
            "2025/03/26 22:16:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0, 28.0, 40.0, 36.0, 36.0, 32.0, 40.0, 40.0]\n",
            "2025/03/26 22:16:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 24.0\n",
            "2025/03/26 22:16:33 INFO dspy.teleprompt.mipro_optimizer_v2: =========================================\n",
            "\n",
            "\n",
            "2025/03/26 22:16:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 12 - Minibatch ==\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 9.00 / 25 (36.0%): 100%|██████████| 25/25 [00:06<00:00,  3.70it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:39 INFO dspy.evaluate.evaluate: Average Metric: 9 / 25 (36.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 36.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].\n",
            "2025/03/26 22:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: []\n",
            "2025/03/26 22:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0, 28.0, 40.0, 36.0, 36.0, 32.0, 40.0, 40.0, 36.0]\n",
            "2025/03/26 22:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 24.0\n",
            "2025/03/26 22:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: ==========================================\n",
            "\n",
            "\n",
            "2025/03/26 22:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 12 - Full Evaluation =====\n",
            "2025/03/26 22:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 40.0) from minibatch trials...\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:06<00:00,  3.67it/s]"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:47 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "2025/03/26 22:16:47 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mNew best full eval score!\u001b[0m Score: 40.0\n",
            "2025/03/26 22:16:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [24.0, 32.0, 28.0, 40.0, 36.0, 36.0, 32.0, 40.0, 40.0, 36.0, 40.0]\n",
            "2025/03/26 22:16:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 40.0\n",
            "2025/03/26 22:16:47 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
            "2025/03/26 22:16:47 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
            "\n",
            "2025/03/26 22:16:47 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 40.0!\n"
          ]
        }
      ],
      "source": [
        "from dspy.teleprompt import MIPROv2\n",
        "import dspy.evaluate\n",
        "\n",
        "# Define our metric validation\n",
        "def validate_context_and_answer(example, pred, trace=None):\n",
        "    answer_EM = dspy.evaluate.answer_exact_match(example, pred)\n",
        "    answer_PM = dspy.evaluate.answer_passage_match(example, pred)\n",
        "    return answer_EM and answer_PM\n",
        "\n",
        "# Set up a MIPROv2 optimizer, which will compile our RAG program.\n",
        "optimizer = MIPROv2(metric=validate_context_and_answer, prompt_model=llm, task_model=llm, num_candidates=2, init_temperature=0.7, auto=None)\n",
        "\n",
        "# Initialize langwatch for this run, to track the optimizer compilation\n",
        "langwatch.dspy.init(experiment=\"my-awesome-experiment\", optimizer=optimizer)\n",
        "\n",
        "# Compile\n",
        "compiled_rag = optimizer.compile(\n",
        "    RAG(),\n",
        "    trainset=trainset,\n",
        "    num_trials=10,\n",
        "    max_bootstrapped_demos=3,\n",
        "    max_labeled_demos=5,\n",
        "    minibatch_size=25,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/plain": [
              "generate_answer.predict = Predict(StringSignature(context, question -> reasoning, answer\n",
              "    instructions='Given the context and the question, generate a reasoned answer that is concise and factually correct, ensuring it consists of no more than 5 words.'\n",
              "    context = Field(annotation=str required=True json_schema_extra={'desc': 'may contain relevant facts', '__dspy_field_type': 'input', 'prefix': 'Context:'})\n",
              "    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})\n",
              "    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${reasoning}', '__dspy_field_type': 'output'})\n",
              "    answer = Field(annotation=str required=True json_schema_extra={'desc': 'often between 1 and 5 words', '__dspy_field_type': 'output', 'prefix': 'Answer:'})\n",
              "))"
            ]
          },
          "execution_count": 11,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "compiled_rag"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "u5vA80_JJX-q"
      },
      "outputs": [],
      "source": [
        "compiled_rag.save(\"optimized_model.json\")"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "notebook_env",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
